import re
from pathlib import Path
from bs4 import BeautifulSoup

from day2.helper import download_html, download_image


class TedSpider(object):
    """Scrape the TED talks listing page and download each talk's thumbnail.

    ``get_talk_links`` fills ``self.talks`` with one dict per talk card
    (keys ``img``, ``duration``, ``name``, ``link``); ``download_images``
    then saves the thumbnail of every collected talk.
    """

    # A run of one or more non-word characters. Written as a raw string so
    # the backslash reaches the regex engine instead of being treated as an
    # (invalid) string escape.
    _NON_WORD_RUN = re.compile(r'\W+')

    def __init__(self):
        self.url = 'https://www.ted.com/talks'
        self.talks = []  # filled by get_talk_links()
        self.image_folder = 'ted'
        self.html_name = 'ted.html'
        self.html_file = Path(self.html_name)

    def get_talk_links(self):
        """Parse the saved listing page and append one dict per talk to ``self.talks``.

        Talk cards that lack the title link, an ``<img>`` tag or a leading
        ``<a>`` tag are skipped, so one malformed card does not abort the
        whole scrape with an ``AttributeError``.
        """
        # Original author's note: "request one time only".
        # NOTE(review): presumably download_html saves the page to
        # self.html_name and skips the request when the file already exists;
        # confirm in day2.helper.
        download_html(self.html_name, self.url)
        # NOTE(review): read_text() decodes with the platform default
        # encoding; confirm it matches how download_html writes the file.
        soup = BeautifulSoup(self.html_file.read_text(), 'html.parser')

        for talk_div in soup.find_all('div', class_='media media--sm-v'):
            heading = talk_div.find('h4', class_='f-w:700 h9 m5')
            title_link = None
            if heading is not None:
                title_link = heading.find('a', class_='ga-link')
            image = talk_div.find('img')  # first img tag
            first_link = talk_div.a  # first a tag; its text is used as the duration

            # find() returns None for a missing node; skip incomplete cards.
            if title_link is None or image is None or first_link is None:
                continue

            self.talks.append({
                'img': image.get('src'),
                'duration': first_link.get_text().strip(),
                'name': title_link.get_text().strip(),
                'link': title_link.get('href'),
            })

    @classmethod
    def _image_name(cls, talk_name):
        """Return the ``.jpg`` file name derived from a talk title.

        Every run of non-word characters becomes a single ``-``, the result
        is lower-cased, and a leading or trailing ``-`` is dropped. A title
        with no word characters therefore yields ``'.jpg'``.
        """
        slug = cls._NON_WORD_RUN.sub('-', talk_name).lower()
        # Runs are already collapsed, so at most one dash sits at each end.
        return f"{slug.strip('-')}.jpg"

    def download_images(self):
        """Download the thumbnail of every talk in ``self.talks`` into ``self.image_folder``.

        Talks whose ``img`` entry is empty or ``None`` (an ``<img>`` tag
        without ``src``) are skipped because there is no URL to fetch.
        """
        for talk in self.talks:
            if not talk['img']:
                continue
            download_image(self.image_folder, talk['img'], self._image_name(talk['name']))


if __name__ == '__main__':
    # Run the two stages in order: collect the talks, then fetch their thumbnails.
    ted = TedSpider()
    ted.get_talk_links()
    ted.download_images()
